www.gusucode.com > wxApp PHP版微信小程序CMS系统 v1.0PHP源码程序 > wxApp PHP版微信小程序CMS系统 v1.0/wxAppCMS_v1.0.0/wxAppCMS_v1.0.0/app/spider/spider_data.class.php

    <?php
/**
* iCMS - i Content Management System
* Copyright (c) 2007-2017 iCMSdev.com. All rights reserved.
*
* @author icmsdev <master@icmsdev.com>
* @site https://www.icmsdev.com
* @licence https://www.icmsdev.com/LICENSE.html
*/
defined('iPHP') OR exit('What are you doing?');

class spider_data {

    public static function crawl($_pid = NULL,$_rid = NULL,$_url = NULL,$_title = NULL) {
        @set_time_limit(0);
        $sid = spider::$sid;
        if ($sid) {
            $sRs   = iDB::row("SELECT * FROM `#iCMS@__spider_url` WHERE `id`='$sid' LIMIT 1;");
            $title = $sRs->title;
            $cid   = $sRs->cid;
            $pid   = $sRs->pid;
            $url   = $sRs->url;
            $rid   = $sRs->rid;
       } else {
            $rid   = spider::$rid;
            $pid   = spider::$pid;
            $title = spider::$title;
            $url   = spider::$url;

            $_rid   === NULL OR $rid = $_rid;
            $_pid   === NULL OR $pid = $_pid;
            $_title === NULL OR $title = $_title;
            $_url   === NULL OR $url = $_url;
        }

        if($pid){
            $project        = spider::project($pid);
            $prule_list_url = $project['list_url'];
        }

        $ruleA           = spider::rule($rid);
        $rule            = $ruleA['rule'];
        $dataArray       = $rule['data'];

        if($prule_list_url){
            $rule['list_url']   = $prule_list_url;
        }

        if (spider::$dataTest) {
            echo "<b>抓取规则信息</b><pre style='max-height:300px;overflow-y: scroll;'>";
            print_r(iSecurity::escapeStr($ruleA));
            print_r(iSecurity::escapeStr($project));
            echo "</pre><hr />";
        }

        $rule['proxy'] && spider::$curl_proxy = $rule['proxy'];
        $rule['data_charset'] && spider::$charset = $rule['data_charset'];

        $responses = array();
        $html      = spider_tools::remote($url);
        if(empty($html)){
            $msg = '错误:001..采集 ' . $url . '文件内容为空!请检查采集规则';
            $msg.= var_export(spider_tools::$curl_info,true);
            if(spider::$work=='shell'){
                echo spider::errorlog("{$msg}\n",$url,'data.empty',array('pid'=>$pid,'sid'=>$sid,'rid'=>$rid));
                return false;
            }else{
                iUI::alert($msg);
            }
        }

//      $http   = spider::check_content_code($html);
//
//      if($http['match']==false){
//          return false;
//      }
//      $content        = $http['content'];
        spider::$allHtml        = array();
        $rule['__url__']        = spider::$url;
        $responses['reurl']     = spider::$url;
        $responses['__title__'] = $title;
        foreach ((array)$dataArray AS $key => $data) {

            $content_html = $html;
            $dname = $data['name'];
            /**
             * [UNSET:name]
             * 注销[name]
             * @var string
             */
            if (strpos($dname,'UNSET:')!== false){
                $_dname = str_replace('UNSET:', '', $dname);
                unset($responses[$_dname]);
                continue;
            }
            /**
             * [DATA:name]
             * 把之前[name]处理完的数据当作原始数据
             * 如果之前有数据会叠加
             * 用于数据多次处理
             * @var string
             */
            if (strpos($dname,'DATA:')!== false){
                $_dname = str_replace('DATA:', '', $dname);
                $content_html = $responses[$_dname];
                unset($responses[$dname]);
            }
            /**
             * [PRE:name]
             * 把PRE:name采集到的数据 当做原始数据
             * 一般用于下载内容
             * @var string
             */
            $pre_dname = 'PRE:'.$dname;
            if(isset($responses[$pre_dname])){
                $content_html = $responses[$pre_dname];
                unset($responses[$pre_dname]);
            }
            /**
             * [EMPTY:name]
             * 如果[name]之前抓取结果数据为空使用这个数据项替换
             * @var string
             */
            if (strpos($dname,'EMPTY:')!== false){
                $_dname = str_replace('EMPTY:', '', $dname);
                if(empty($responses[$_dname])){
                    $dname = $_dname;
                }else{
                    //有值不执行抓取
                    continue;
                }
            }
            $content = spider_content::crawl($content_html,$data,$rule,$responses);
            if($content === null){
                $responses[$key] = null;
                continue;
            }
            unset($content_html);

            if (strpos($dname,'ARRAY:')!== false){
                $dname = str_replace('ARRAY:', '', $dname);
                $cArray = array();

                foreach ((array)$content as $k => $value) {
                    foreach ((array)$value as $key => $val) {
                        $cArray[$key][$k]=$val;
                    }
                }
                if($cArray){
                    $content = $cArray;
                    unset($cArray);
                }
            }

            /**
             * [name.xxx]
             * 采集内容做为数组
             */
            if (strpos($dname,'.')!== false){
                $f_key = substr($dname,0,stripos($dname, "."));
                $s_key = substr(strrchr($dname, "."), 1);
                // $responses = self::create_multi_array($dname,$content);
                if(isset($responses[$f_key][$s_key])){
                    if(is_array($responses[$f_key][$s_key])){
                        $responses[$f_key][$s_key] = array_merge((array)$responses[$f_key][$s_key],(array)$content);
                    }else{
                        $responses[$f_key][$s_key].= $content;
                    }
                }else{
                    $responses[$f_key][$s_key] = $content;
                }
            }else{
                /**
                 * 多个name 内容合并
                 */
                if(isset($responses[$dname])){
                    if(is_array($responses[$dname])){
                        $responses[$dname] = array_merge((array)$responses[$dname],(array)$content);
                    }else{
                        $responses[$dname].= $content;
                    }
                }else{
                    $responses[$dname] = $content;
                }
            }
            /**
             * 对匹配多条的数据去重过滤
             */
            if(!is_array($responses[$dname]) && $data['multi']){
                if(strpos($responses[$dname], ',')!==false){
                    $_dnameArray = explode(',', $responses[$dname]);
                    $dnameArray  = array();
                    foreach ((array)$_dnameArray as $key => $value) {
                        $value = trim($value);
                        $value && $dnameArray[]=$value;
                    }
                    $dnameArray = array_filter($dnameArray);
                    $dnameArray = array_unique($dnameArray);
                    $responses[$dname] = implode(',', $dnameArray);
                    unset($dnameArray,$_dnameArray);
                }
            }

            gc_collect_cycles();
        }
        foreach ($responses as $key => $value) {
            if(strpos($key, ':')!==false){
                unset($responses[$key]);
            }
        }
        if(isset($responses['title']) && empty($responses['title'])){
            $responses['title'] = $responses['__title__'];
        }
        spider::$allHtml = array();
        unset($html);

        gc_collect_cycles();

        if (spider::$dataTest) {
            echo "<b>最终采集结果:</b>";
            echo "<pre style='width:99%;word-wrap: break-word;white-space: pre-wrap;'>";
            print_r(iSecurity::escapeStr($responses));
            echo '<hr />';
            echo '使用内存:'.iFS::sizeUnit(memory_get_usage()).' 执行时间:'.iPHP::timer_stop().'s';
            echo "</pre>";
        }

        self::set_watermark_config($rule);

        if (spider::$callback['data'] && is_callable(spider::$callback['data'])) {
            $responses = call_user_func_array(spider::$callback['data'],array($responses));
        }

        return $responses;
    }
    public static function set_watermark_config($rule){
        iHttp::$CURLOPT_ENCODING        = '';
        iHttp::$CURLOPT_REFERER         = '';
        files::$watermark_config['pos'] = iCMS::$config['watermark']['pos'];
        files::$watermark_config['x']   = iCMS::$config['watermark']['x'];
        files::$watermark_config['y']   = iCMS::$config['watermark']['y'];
        files::$watermark_config['img'] = iCMS::$config['watermark']['img'];
        files::$watermark_enable        = iCMS::$config['watermark']['enable'];

        $rule['fs']['encoding'] && iHttp::$CURLOPT_ENCODING = $rule['fs']['encoding'];
        $rule['fs']['referer']  && iHttp::$CURLOPT_REFERER  = $rule['fs']['referer'];
        if($rule['watermark_mode']){
            files::$watermark_config['pos'] = $rule['watermark']['pos'];
            files::$watermark_config['x']   = $rule['watermark']['x'];
            files::$watermark_config['y']   = $rule['watermark']['y'];
            $rule['watermark']['img'] && files::$watermark_config['img'] = $rule['watermark']['img'];
        }
        if($rule['watermark_mode']=="2"){
            files::$watermark_enable = false;
        }
    }
    public static function create_multi_array($string,$value=null){
        $a_array = explode('.', $string);
        krsort ( $a_array );
        $count = count($a_array);
        $a = $value;
        foreach ($a_array as $k => $v) {
            $a = array($v=>$a);
            if(count($a)>1){
                array_shift($a);
            }
        }
        return $a;
    }
}